# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)
require("maps")
library(geosphere)
library(stringr)
library(rgdal)
library(caret)
library(lubridate)
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
library(ggmap)
path.to.csv <- '../Milestone 2/Seattle_Police_Department_911_Incident_Response_Oct17.csv'
spd.911 <- read.csv(path.to.csv, TRUE)
spd.911$clearance_date_ts = as.POSIXct(strptime(spd.911$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
spd.911$clearance_date_date = as.Date(spd.911$clearance_date_ts)
View(spd.911)
# path to the FOLDER with the .shp file in it. the second param is the name of the .shp file
# seattle <- readOGR(dsn = path.expand("~/documents/INFO370/project-teamname-v2/maps-api-test"), layer = "Seattle_City_Limits")
# usa <- map_data("state")
# data <- merge(usa, spd.911)
# Red Square coordinates
here_long <- -122.3095
here_lat <- 47.6560
seattle = get_map(location = c(here_long, here_lat), zoom = 13, maptype = 'roadmap')
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=47.656,-122.3095&zoom=13&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false

freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)
ggplot(as.data.frame(freq_by_desc),
aes(x = Var1, y = Freq)) +
geom_bar(stat = 'identity') +# create bar plot
coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations
ggmap(seattle) +
geom_point(data = data, aes(x = Longitude, y = Latitude, group = Event.Clearance.Description, color = Event.Clearance.Description), alpha = 0.5) +
facet_wrap(~ Event.Clearance.Description) +
theme(axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
legend.position = "none"
)

# selecting just ID and location data
df_loc <- data %>% dplyr::select(CAD.CDW.ID, Longitude, Latitude)
# figuring out number of clusters
wss <- c()
# clusters 1 to 15
for (i in 1:15) {
wss[i] <- sum(kmeans(df_loc, centers=i)$withinss)
}
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")

# fitting model
fit <- kmeans(df_loc, 10)
fit$centers # look at cluster sizes and means. want clusters to be about equal size
CAD.CDW.ID Longitude Latitude
1 2110534 -122.3090 47.65473
2 2108019 -122.3145 47.65467
3 2114454 -122.3135 47.66288
4 2115894 -122.3117 47.66063
5 2117445 -122.3115 47.66015
6 2105779 -122.3122 47.65821
7 2119756 -122.3128 47.65627
8 2122787 -122.3178 47.66599
9 2112589 -122.3124 47.66454
10 2124681 -122.3089 47.65948
fit$cluster
[1] 6 6 6 6 6 6 6 6 6 2 2 2 2 2 2 1 1 1 1 1 1 1 9 9 9 9 9 9 9 9 9 9 3 3
[35] 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 7 7 7 7 7 7 7 7
[69] 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 10 10 10 10 10 10 10 10 10
cluster.size <- data.frame(1:10, fit$size)
cluster.size
ggplot(data = cluster.size, aes(x = X1.10, y = fit.size)) +
geom_bar(stat = 'identity')

ggplot()

ggmap(seattle) +
geom_point(data = as.data.frame(fit$centers), aes(x = Longitude, y = Latitude), alpha = 0.5)

# looking at cluster means
aggregate(df_loc, by=list(fit$cluster), FUN=mean)
df_loc
# adding data back into dataframe
# df_loc <- df_loc %>% mutate(cluster = fit$cluster)
data$cluster <- fit$cluster
# View(data)
# timestamp -> year month day hour minute
# sector -> to factor (there are 17 sectors)
# beat -> to factor (there are 3 beats per sector)
# clean the data a bit more
data$event_clearance_ts = as.POSIXct(strptime(data$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
data$event_clearance_date = as.Date(data$event_clearance_ts)
data$event_clearance_month = month(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_day = weekdays(data$event_clearance_date)
data$event_clearance_hr = hour(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_mn = minute(ymd_hms(as.character(data$event_clearance_ts)))
data$Initial.Type.Group = factor(data$Initial.Type.Group)
data$Event.Clearance.Group = factor(data$Event.Clearance.Group)
data$Zone.Beat = factor(data$Zone.Beat)
data$District.Sector = factor(data$District.Sector)
data$event_clearance_day = factor(data$event_clearance_day)
data
col.names <- paste(c(
"Event.Clearance.Code"
, "cluster"
, "Census.Tract"
, "event_clearance_day"
, "Event.Clearance.Group"
, "Event.Clearance.SubGroup"
, "District.Sector"
, "Zone.Beat"
#, "event_clearance_ts"
# ,"Incident.Location"
, "event_clearance_hr"
, "event_clearance_mn"
, "event_clearance_month"
, "Hundred.Block.Location"
), collapse="|")
cols <- grep(col.names, colnames(data))
cols
[1] 4 6 7 9 10 11 12 23 26 27 28 29
# corr_matrix <- cor(data[,cols]) # correlations between all predictor vars
# corr_matrix
# cutoff <- 0.5 # should be higher in practice
# highly_corr <- findCorrelation(corr_matrix, cutoff=cutoff)
# print(colnames(spd.911)[highly_corr]) # age is highly correalted with pregnant
train.data <- select(data, cols)
train.data
# data <- data %>% droplevels()
# grep("Hundred.Block.Location", colnames(train.data), invert = T)
predictors <- grep("Hundred.Block.Location", colnames(train.data), invert = T)
outcome <- grep("Hundred.Block.Location", colnames(train.data))
# train.data[,predictors]
frame <- data.frame(train.data[,predictors])
frame
out.factor <- train.data$Hundred.Block.Location
as.vector(out.factor)
[1] "43XX BLOCK OF 15 AV NE" "24XX BLOCK OF E LOUISA ST"
[3] "48XX BLOCK OF SAND POINT WY NE" "42XX BLOCK OF UNIVERSITY WY NE"
[5] "21XX BLOCK OF N NORTHLAKE WY" "14XX BLOCK OF NE 43 ST"
[7] "14XX BLOCK OF NE 43 ST" "17XX BLOCK OF N 45 ST"
[9] "NE 54 ST / 21 AV NE" "29XX BLOCK OF FAIRVIEW AV E"
[11] "SAND POINT WY NE / 40 AV NE" "55XX BLOCK OF 12 AV NE"
[13] "NE 43 ST / UNIVERSITY WY NE" "E ROANOKE ST / I5 NB"
[15] "E HAMLIN ST / EASTLAKE AV E" "23XX BLOCK OF 24 AV E"
[17] "50XX BLOCK OF UNIVERSITY WY NE" "E MONTLAKE PL E / E LAKE WASHINGTON BV"
[19] "14XX BLOCK OF NE 43 ST" "16XX BLOCK OF INTERLAKEN PL E"
[21] "8XX BLOCK OF NE 66 ST" "NE 47 ST / 17 AV NE"
[23] "45XX BLOCK OF 16 AV NE" "16XX BLOCK OF NE 50 ST"
[25] "45XX BLOCK OF UNIVERSITY WY NE" "47XX BLOCK OF UNIVERSITY WY NE"
[27] "NE 52 ST / ROOSEVELT WY NE" "NE 50 ST / 9 AV NE"
[29] "45XX BLOCK OF 25 AV NE" "61XX BLOCK OF BROOKLYN AV NE"
[31] "47XX BLOCK OF UNIVERSITY WY NE" "45XX BLOCK OF UNIVERSITY WY NE"
[33] "15 AV NE / NE 55 ST" "NE 45 ST / 11 AV NE"
[35] "46XX BLOCK OF 27 AV NE" "2 AV NE / NE 50 ST"
[37] "11 AV NE / NE 42 ST" "50XX BLOCK OF 7 AV NE"
[39] "NE BLAKELEY ST / 25 AV NE" "4213 1 / 2 UNIVERSITY WY NE"
[41] "15 AV NE / NE 42 ST" "NE 47 ST / 9 AV NE"
[43] "NE 45 ST / 5 AV NE" "NE 52 ST / 15 AV NE"
[45] "47XX BLOCK OF 30 AV NE" "UNIVERSITY WY NE / NE 56 ST"
[47] "ROOSEVELT WY NE / NE 45 ST" "50XX BLOCK OF UNIVERSITY WY NE"
[49] "41XX BLOCK OF UNIVERSITY WY NE" "45XX BLOCK OF 15 AV NE"
[51] "27XX BLOCK OF MONTLAKE BV E" "14XX BLOCK OF N 42 ST"
[53] "38XX BLOCK OF 42 AV NE" "42XX BLOCK OF UNIVERSITY WY NE"
[55] "48XX BLOCK OF SAND POINT WY NE" "24XX BLOCK OF E LOUISA ST"
[57] "37XX BLOCK OF CORLISS AV N" "45XX BLOCK OF 18 AV NE"
[59] "ROOSEVELT WY NE / NE 63 ST" "NE 50 ST / 2 AV NE"
[61] "45XX BLOCK OF 25 AV NE" "48XX BLOCK OF SAND POINT WY NE"
[63] "45XX BLOCK OF 9 AV NE" "14XX BLOCK OF N 45 ST"
[65] "15 AV NE / NE 50 ST" "23XX BLOCK OF MINOR AV E"
[67] "24 AV E / E MONTLAKE PL E" "50XX BLOCK OF ROOSEVELT WY NE"
[69] "8XX BLOCK OF NE 42 ST" "40XX BLOCK OF UNIVERSITY WY NE"
[71] "38XX BLOCK OF NE 57 ST" "9XX BLOCK OF E ROANOKE ST"
[73] "23XX BLOCK OF EASTLAKE AV E" "BURKE AV N / N NORTHLAKE WY"
[75] "22XX BLOCK OF NE 51 ST" "52XX BLOCK OF 22 AV NE"
[77] "N 45 ST / BURKE AV N" "LATONA AV NE / NE 58 ST"
[79] "1 AV NE / N 56 ST" "ROOSEVELT WY NE / NE 65 ST"
[81] "43XX BLOCK OF UNIVERSITY WY NE" "65XX BLOCK OF 18 AV NE"
[83] "50XX BLOCK OF 19 AV NE" "EASTLAKE AV E / E BOSTON ST"
[85] "45XX BLOCK OF 12 AV NE" "15 AV NE / NE 50 ST"
[87] "14XX BLOCK OF NE 43 ST" "22 AV E / E MILLER ST"
[89] "50XX BLOCK OF RAVENNA AV NE" "52XX BLOCK OF 15 AV NE"
[91] "55XX BLOCK OF 17 AV NE" "39 AV NE / NE 55 ST"
control <- rfeControl(functions = rfFuncs, method="cv", number=10)
results <- rfe(frame, out.factor, sizes = c(1:13), rfeControl = control) # this will take AWHILE...
Error in { : task 1 failed - "Can't have empty classes in y."
---
title: "R Notebook"
output: html_notebook
---

```{r setup}
# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)
require("maps")
library(geosphere)
library(stringr)
library(rgdal)
library(caret)
library(lubridate)
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
library(ggmap)
path.to.csv <- '../Milestone 2/Seattle_Police_Department_911_Incident_Response_Oct17.csv'
spd.911 <- read.csv(path.to.csv, TRUE)

spd.911$clearance_date_ts = as.POSIXct(strptime(spd.911$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
spd.911$clearance_date_date = as.Date(spd.911$clearance_date_ts)
View(spd.911)



# path to the FOLDER with the .shp file in it. the second param is the name of the .shp file
# seattle <- readOGR(dsn = path.expand("~/documents/INFO370/project-teamname-v2/maps-api-test"), layer = "Seattle_City_Limits")

# usa <- map_data("state")
# data <- merge(usa, spd.911)
# Red Square coordinates
here_long <-  -122.3095
here_lat <- 47.6560

seattle = get_map(location = c(here_long, here_lat), zoom = 13, maptype = 'roadmap')

```


```{r}


spd.911 <- spd.911 %>% 
             rowwise() %>% 
             mutate(dist=distVincentyEllipsoid(c(Longitude, Latitude), c(here_long, here_lat)))              
nrow(spd.911)

descriptions <- c("STRONG ARM ROBBERY", "PERSON WITH A WEAPON (NOT GUN)", "HAZARDS", "HARASSMENT, THREATS", "FIGHT DISTURBANCE", "CRISIS COMPLAINT - GENERAL", "ARMED ROBBERY")

# Removes Specifically Harassment by Telephone and Writing, as well as other non-scary crimes
data.ped <- spd.911 %>% filter(str_detect(Event.Clearance.Description, paste(descriptions, collapse="|"))) %>% filter(!str_detect(Event.Clearance.Description, "HARASSMENT, THREATS - BY TELEPHONE, WRITING"))
# data.ped <- data.now
nrow(data.ped)

# data.now <- data.ped %>% filter(clearance_date_ts < '2017-10-31 00:00:00')
nrow(data.now)
                  
data.here <- data.now %>% filter(dist < 2600)

data <- read.csv('YearSEAPD.csv', header = TRUE)
nrow(data)
# View(data)

ggmap(seattle) +
   geom_point(data = data, aes(x = Longitude, y = Latitude), colour = "red", alpha = 0.75)
  #coord_map()

```

```{r}
freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)

ggplot(as.data.frame(freq_by_desc), 
       aes(x = Var1, y = Freq)) +
       geom_bar(stat = 'identity') +# create bar plot
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations

        
```

```{r}
ggmap(seattle) +
  geom_point(data = data, aes(x = Longitude, y = Latitude, group = Event.Clearance.Description, color = Event.Clearance.Description), alpha = 0.5) +
  facet_wrap(~ Event.Clearance.Description) +
  theme(axis.ticks = element_blank(), 
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
        )
```

```{r}
# selecting just ID and location data
df_loc <- data %>% dplyr::select(CAD.CDW.ID, Longitude, Latitude)

# figuring out number of clusters
wss <- c()
# clusters 1 to 15
for (i in 1:15) {
  wss[i] <- sum(kmeans(df_loc, centers=i)$withinss)
}
plot(1:15, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# fitting model
fit <- kmeans(df_loc, 10)
fit$centers # look at cluster sizes and means. want clusters to be about equal size
fit$cluster
cluster.size <- data.frame(1:10, fit$size)
cluster.size

ggplot(data = cluster.size, aes(x = X1.10, y = fit.size)) +
  geom_bar(stat = 'identity')
ggplot()
ggmap(seattle) +
  geom_point(data = as.data.frame(fit$centers), aes(x = Longitude, y = Latitude), alpha = 0.5)
# looking at cluster means
aggregate(df_loc, by=list(fit$cluster), FUN=mean)

df_loc

# adding data back into dataframe 
# df_loc <- df_loc %>% mutate(cluster = fit$cluster) 
data$cluster <- fit$cluster

# View(data)
```

```{r}
# timestamp ->  year  month day hour  minute
# sector -> to factor (there are 17 sectors)
# beat -> to factor (there are 3 beats per sector)

# clean the data a bit more
data$event_clearance_ts = as.POSIXct(strptime(data$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
data$event_clearance_date = as.Date(data$event_clearance_ts)
data$event_clearance_month = month(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_day = weekdays(data$event_clearance_date)
data$event_clearance_hr = hour(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_mn = minute(ymd_hms(as.character(data$event_clearance_ts)))
data$Initial.Type.Group = factor(data$Initial.Type.Group)
data$Event.Clearance.Group = factor(data$Event.Clearance.Group)
data$Zone.Beat = factor(data$Zone.Beat)
data$District.Sector = factor(data$District.Sector)
data$event_clearance_day = factor(data$event_clearance_day)
data

col.names <- paste(c(
  "Event.Clearance.Code"
  , "cluster"
  , "Census.Tract"
  , "event_clearance_day"
  , "Event.Clearance.Group"
  , "Event.Clearance.SubGroup"
  , "District.Sector"
  , "Zone.Beat"
  #, "event_clearance_ts"
  # ,"Incident.Location"
  , "event_clearance_hr"
  , "event_clearance_mn"
  , "event_clearance_month" 
  , "Hundred.Block.Location"
  ), collapse="|")
cols <- grep(col.names, colnames(data))
cols
# corr_matrix <- cor(data[,cols]) # correlations between all predictor vars
# corr_matrix

# cutoff <- 0.5 # should be higher in practice

# highly_corr <- findCorrelation(corr_matrix, cutoff=cutoff)
# print(colnames(spd.911)[highly_corr]) # age is highly correalted with pregnant

train.data <- select(data, cols)
train.data
# data <- data %>% droplevels()

# grep("Hundred.Block.Location", colnames(train.data), invert = T)

predictors <- grep("Hundred.Block.Location", colnames(train.data), invert = T)
outcome <- grep("Hundred.Block.Location", colnames(train.data))

# train.data[,predictors]
frame <- data.frame(train.data[,predictors])
frame
out.factor <- train.data$Hundred.Block.Location
as.vector(out.factor)


control <- rfeControl(functions = rfFuncs, method="cv", number=10)
results <- rfe(frame, out.factor, sizes = c(1:13), rfeControl = control) # this will take AWHILE...

results
ggplot(results)

# chosen features
predictors(results)
```
